MARVEL - Social Network Analysis

Group B

In [1]:
from IPython.display import Image                        
url='https://upload.wikimedia.org/wikipedia/commons/thumb/b/b9/Marvel_Logo.svg/1200px-Marvel_Logo.svg.png'
Image(url,width=900, height=800)
Out[1]:

Marvel counts among its characters such well-known superheroes as Spider-Man, Iron Man, Captain America, the Hulk, Thor, Wolverine, Ant-Man, the Wasp, Black Widow, Hawkeye, Captain Marvel, Black Panther, Doctor Strange, the Scarlet Witch, Quicksilver, She-Hulk, the Vision, the Falcon, the Winter Soldier, Ghost Rider, Blade, Daredevil, Luke Cage, Iron Fist, Ms. Marvel, Miles Morales, the Punisher and Deadpool. Superhero teams exist such as the Avengers, the X-Men, the Fantastic Four and the Guardians of the Galaxy."

Import and Cleaning Data

In [2]:
import pandas as pd 
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import plotly.express as px

from igraph import *

from IPython.display import display
from PIL import Image

hero_network_df = pd.read_csv('hero-network.csv', sep=',',encoding= 'unicode_escape')
graphdf = Graph.DataFrame(hero_network_df, directed=False)
print('edge count:', graphdf.ecount())
print('node count:', graphdf.vcount())
edge count: 574467
node count: 6426

Degree distribution

In [3]:
data = graphdf.get_vertex_dataframe()
data['degree'] = graphdf.degree(mode='all')
print('Maximum degree:', data["degree"].max())
print('Minimum degree:', data["degree"].min())
plt.hist(data['degree'])


## After analyzing the maximum and minimum degrees we can appreciate the majority of the actors have very low 
## degree and also see there are actors with very high degrees
Maximum degree: 16499
Minimum degree: 1
Out[3]:
(array([6.302e+03, 7.200e+01, 1.700e+01, 1.200e+01, 7.000e+00, 8.000e+00,
        5.000e+00, 1.000e+00, 1.000e+00, 1.000e+00]),
 array([1.00000e+00, 1.65080e+03, 3.30060e+03, 4.95040e+03, 6.60020e+03,
        8.25000e+03, 9.89980e+03, 1.15496e+04, 1.31994e+04, 1.48492e+04,
        1.64990e+04]),
 <a list of 10 Patch objects>)

Centrality Measures

In [4]:
data['closeness'] = graphdf.closeness()
data['betweeness'] = graphdf.betweenness()
data['pageRank'] = graphdf.pagerank()
In [15]:
## Search for highest degree among characters

data_deg = data.sort_values(by=['degree'], ascending = False)
print('Hihgest degrees characters:\n', data.head(10))
Hihgest degrees characters:
                            name  degree  closeness  betweeness  pageRank
vertex ID                                                               
0          24-HOUR MAN/EMMANUEL       5   0.263056    0.000000  0.000038
1          3-D MAN/CHARLES CHAN     148   0.472144  168.090513  0.000126
2              4-D MAN/MERCURIO     118   0.416661  347.749632  0.000099
3                       8-BALL/      21   0.317178    2.700000  0.000077
4                             A      17   0.362305    0.000000  0.000051
5                         A'YIN      55   0.363003    0.360541  0.000060
6                  ABBOTT, JACK       8   0.372869    0.000000  0.000036
7                       ABCISSA      23   0.395031    4.862415  0.000041
8                          ABEL      20   0.343539    0.040000  0.000047
9          ABOMINATION | MUTANT      48   0.345056    0.000000  0.000101
In [16]:
## Search for highest betweeness among characters

data_between = data.sort_values(by=['betweeness'], ascending = False)
print('Highest betweeness characters:\n', data.head(10))
Highest betweeness characters:
                            name  degree  closeness  betweeness  pageRank
vertex ID                                                               
0          24-HOUR MAN/EMMANUEL       5   0.263056    0.000000  0.000038
1          3-D MAN/CHARLES CHAN     148   0.472144  168.090513  0.000126
2              4-D MAN/MERCURIO     118   0.416661  347.749632  0.000099
3                       8-BALL/      21   0.317178    2.700000  0.000077
4                             A      17   0.362305    0.000000  0.000051
5                         A'YIN      55   0.363003    0.360541  0.000060
6                  ABBOTT, JACK       8   0.372869    0.000000  0.000036
7                       ABCISSA      23   0.395031    4.862415  0.000041
8                          ABEL      20   0.343539    0.040000  0.000047
9          ABOMINATION | MUTANT      48   0.345056    0.000000  0.000101
In [ ]:
 

Network Diameter and Average Path Length

In [17]:
graph_diameter = graphdf.diameter()
print('Graph Diameter:', graph_diameter)
Graph Diameter: 5
In [18]:
average_path = graphdf.average_path_length()
print('Average Path Length:', average_path)
Average Path Length: 2.638426690145394
In [19]:
### As the data set is so big, we decided to analyse the connections and graphs of the main characters 
### (50 with the higherst degrees)
 
main_char_df = data_deg.head(50)
main_char_list = main_char_df['name'].tolist() 


def is_involved_main_char(row, main_char_list):
    if row['hero1'] in main_char_list:
        if row['hero2'] in main_char_list:
                return True
    return False

hero_network_df['isMainChar'] = hero_network_df.apply(lambda row: is_involved_main_char(row, main_char_list), axis=1)
hero_network_df_main_char = hero_network_df[hero_network_df['isMainChar']==True]
hero_network_df_main_char.reset_index(inplace=True, drop=True)
In [20]:
print(len(hero_network_df_main_char.index))
72783
In [21]:
graphdf_main= Graph.DataFrame(hero_network_df_main_char, directed=False)
In [22]:
data_main = graphdf_main.get_vertex_dataframe()
In [23]:
### We can see there are no community since the relationship between characters is extremely centralized.
### We can see there are two outliers: Miss America & Patriot.

plot(graphdf_main, 
     vertex_label = graphdf_main.vs["name"],
     vertex_size = 10,
     vertex_label_dist = 1.5,
     vertex_label_size = 10,
    )
Out[23]:
In [14]:
print(len(data_main.index))
50
In [ ]: